package com.sifang.POI.dianping;

import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlcleaner.CleanerProperties;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.PrettyHtmlSerializer;
import org.htmlcleaner.TagNode;
import org.htmlcleaner.XPatherException;

/**
 * Static helpers for fetching a web page over HTTP, parsing it with
 * HtmlCleaner into a {@link TagNode} tree, and writing that tree to a file.
 */
public class WebUtil {
	
	/** Cleaner settings shared by the parser and by the serializer used in {@link #saveToFile(TagNode, String)}. */
	private static final CleanerProperties htmlCleanerProps;
	/** Single parser instance, built from {@link #htmlCleanerProps}. */
	private static final HtmlCleaner htmlCleaner;
	
	/** Scheme and host that {@link #getFullUrl(String)} prepends to site-relative paths. */
	public static final String DIANPING_ROOT = "http://www.dianping.com";
	
	static {
		htmlCleanerProps = new CleanerProperties();
		htmlCleanerProps.setTranslateSpecialEntities(true);
		htmlCleanerProps.setTransResCharsToNCR(true);
		htmlCleanerProps.setOmitComments(true);
		htmlCleaner = new HtmlCleaner(htmlCleanerProps);
	}
	
	/**
	 * Builds an absolute URL by prepending {@link #DIANPING_ROOT} to the given path.
	 * 
	 * @param urlString path to append; it is concatenated as-is, so it is
	 *                  expected to start with {@code /}
	 * @return {@code DIANPING_ROOT + urlString}
	 */
	public static String getFullUrl(String urlString) {
		return DIANPING_ROOT + urlString;
	}
	
	/**
	 * Downloads the page at the given URL and parses it into a tag tree.
	 * The response body is decoded as UTF-8 and the request is sent with a
	 * Firefox User-Agent header.
	 * 
	 * @param urlString absolute HTTP URL of the page to fetch
	 * @return root node of the parsed document
	 * @throws IOException if the URL is malformed, the connection fails, or
	 *                     the response cannot be read
	 */
	public static TagNode cleanUrl(String urlString) throws IOException {
		URL url = new URL(urlString);
		HttpURLConnection connection = (HttpURLConnection) url.openConnection();
		connection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:11.0) Gecko/20100101 Firefox/11.0");
		InputStream in = connection.getInputStream();
		try {
			return htmlCleaner.clean(in, "UTF-8");
		} finally {
			// Always release the response stream, even when parsing fails;
			// otherwise each call leaks the underlying socket. Only the
			// stream is closed (no disconnect()) so keep-alive reuse still works.
			in.close();
		}
	}
	
	/**
	 * Serializes a tag tree to a file as pretty-printed HTML, encoded as UTF-8.
	 * 
	 * @param node     root of the tree to write
	 * @param fileName path of the output file
	 * @throws IOException if the file cannot be written
	 */
	public static void saveToFile(TagNode node, String fileName) throws IOException {
		new PrettyHtmlSerializer(htmlCleanerProps).writeToFile(node, fileName, "UTF-8");
	}
	
	/**
	 * Fetches the page at the given URL with {@link #cleanUrl(String)} and
	 * writes the parsed result to a file.
	 * 
	 * @param urlString absolute HTTP URL of the page to fetch
	 * @param fileName  path of the output file
	 * @throws IOException if the download or the file write fails
	 */
	public static void saveToFile(String urlString, String fileName) throws IOException {
		saveToFile(cleanUrl(urlString), fileName);
	}

	/**
	 * Manual smoke test: downloads one hard-coded search-result page and
	 * saves the cleaned markup to {@code c:\bug1.xml}.
	 * 
	 * @param args ignored
	 * @throws IOException if the download or the file write fails
	 * @throws XPatherException declared in the signature; nothing in this
	 *                          method currently evaluates an XPath expression
	 */
	public static void main(String[] args) throws IOException, XPatherException {
		saveToFile("http://www.dianping.com/search/category/4/10/g1496r27p1", "c:\\bug1.xml");
	}

}
